commit
92e122297f
@ -0,0 +1,2 @@
|
|||||||
|
.venv/
|
||||||
|
src/__pycache__/
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
# Requirements
|
||||||
|
### Minimal versions
|
||||||
|
python 3.7
|
||||||
|
postgresql 11
|
||||||
|
|
||||||
|
### Database schema
|
||||||
|
includes a generated `tsvector` column `geodata_search_ts` with a GIN index `geodata_search_idx`
|
||||||
|
```sql
|
||||||
|
ALTER TABLE geodata ADD COLUMN geodata_search_ts TSVECTOR
|
||||||
|
GENERATED ALWAYS AS (to_tsvector('russian', geodata.description)) STORED
|
||||||
|
```
|
||||||
|
(use coalesce() if combining multiple columns)
|
||||||
|
```sql
|
||||||
|
CREATE INDEX geodata_search_idx ON geodata USING GIN (geodata_search_ts);
|
||||||
|
```
|
||||||
@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"folders": [
|
||||||
|
{
|
||||||
|
"path": "."
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"settings": {
|
||||||
|
"python.defaultInterpreterPath": ".venv/bin/python",
|
||||||
|
"python.formatting.blackPath": "/usr/bin/black",
|
||||||
|
"python.linting.flake8Path": "/usr/bin/flake8",
|
||||||
|
"python.linting.flake8Args": [
|
||||||
|
"--black-config=''"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
anyio==3.5.0
|
||||||
|
asgiref==3.5.0
|
||||||
|
click==8.0.4
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
fastapi==0.74.1
|
||||||
|
greenlet==1.1.2
|
||||||
|
h11==0.13.0
|
||||||
|
idna==3.3
|
||||||
|
openpyxl==3.0.9
|
||||||
|
psycopg2==2.9.3
|
||||||
|
pydantic==1.9.0
|
||||||
|
python-multipart==0.0.5
|
||||||
|
six==1.16.0
|
||||||
|
sniffio==1.2.0
|
||||||
|
SQLAlchemy==1.4.31
|
||||||
|
starlette==0.17.1
|
||||||
|
typing_extensions==4.1.1
|
||||||
|
uvicorn==0.17.5
|
||||||
@ -0,0 +1,14 @@
|
|||||||
|
anyio==3.5.0
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
fastapi==0.74.1
|
||||||
|
greenlet==1.1.2
|
||||||
|
idna==3.3
|
||||||
|
openpyxl==3.0.9
|
||||||
|
psycopg2==2.9.3
|
||||||
|
pydantic==1.9.0
|
||||||
|
python-multipart==0.0.5
|
||||||
|
six==1.16.0
|
||||||
|
sniffio==1.2.0
|
||||||
|
SQLAlchemy==1.4.31
|
||||||
|
starlette==0.17.1
|
||||||
|
typing_extensions==4.1.1
|
||||||
@ -0,0 +1,62 @@
|
|||||||
|
from typing import List
|
||||||
|
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from sqlalchemy.sql.expression import func
|
||||||
|
from sqlalchemy import inspect
|
||||||
|
|
||||||
|
from . import models, schemas
|
||||||
|
from .database import Base
|
||||||
|
|
||||||
|
|
||||||
|
def get_item(db: Session, item_id: int):
    """Fetch a single dataset description (Item) by primary key, or None."""
    query = db.query(models.Item)
    return query.filter(models.Item.id == item_id).first()
|
||||||
|
|
||||||
|
|
||||||
|
def get_item_by_description(db: Session, needle: str, skip: int = 0, limit: int = 20):
    """Full-text search over dataset descriptions.

    Extra query syntax accepted in *needle* (handled by websearch_to_tsquery):
    - plain text: the given words in any order
    - "quoted text": the given words in the given order
    - OR: any of the given words
    - -: must not contain the word

    Returns a paginated list of matching models.Item rows ordered by id.
    """
    # hardcode russian for now
    # built-in postgres websearch_to_tsquery() is good
    # https://www.postgresql.org/docs/11/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
    # TODO: add requested search columns [0]
    items_table = inspect(models.Item).local_table
    # FIX: the table has no column named "ts"; the tsvector column is
    # "geodata_search_ts" (see models.Item and the README schema), so
    # items_table.c.ts would raise AttributeError at query time.
    result = (
        db.query(models.Item)
        .filter(
            items_table.c.geodata_search_ts.op("@@")(
                func.websearch_to_tsquery('"russian"', needle)
            )
        )
        .order_by(items_table.c.id)
        .offset(skip)
        .limit(limit)
        .all()
    )
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_items(db: Session, skip: int = 0, limit: int = 20):
    """Return a paginated list of all dataset descriptions in the system."""
    query = db.query(models.Item).offset(skip)
    return query.limit(limit).all()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_items(db: Session, items: List[schemas.ItemCreate]):
    """Bulk-import dataset descriptions.

    Returns a (accepted, inserted) tuple: how many rows were received and
    how many actually appeared in the table after the commit.
    """
    count_before = db.query(models.Item).count()
    accepted = len(items)
    # psycopg2 fast-executemany settings are configured on the engine:
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-fast-execution-helpers
    db.add_all(models.ItemCreate(**item.dict()) for item in items)
    db.commit()
    inserted = db.query(models.Item).count() - count_before
    return (accepted, inserted)
|
||||||
|
|
||||||
|
|
||||||
|
def get_headers(db: Session):
    """Return every known table-header mapping row."""
    headers_query = db.query(models.Header)
    return headers_query.all()
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Database connection parameters
# NOTE(review): constant name is misspelled ("SQALCHEMY" -> "SQLALCHEMY");
# kept as-is since renaming could break any external importer of this module.
# NOTE(review): credentials are hardcoded in the URL -- consider reading the
# password from the environment or a secrets store instead.
SQALCHEMY_DATABASE_URL = "postgresql+psycopg2://geodata:PASSWORD@postgres/geodata"


engine = create_engine(
    SQALCHEMY_DATABASE_URL,
    # psycopg2 fast-executemany tuning (used by crud.insert_items bulk loads):
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-connect-arguments
    executemany_mode="values_only",
    executemany_values_page_size=10000,
    executemany_batch_page_size=500,
)
# Session factory; autocommit is off, so callers must commit explicitly.
SessionLocal = sessionmaker(autocommit=False, bind=engine)

# Declarative base class shared by all ORM models in this package.
Base = declarative_base()
|
||||||
@ -0,0 +1,139 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
from re import IGNORECASE, sub as substitute
|
||||||
|
from typing import List
|
||||||
|
from urllib.parse import unquote_plus
|
||||||
|
|
||||||
|
from fastapi import Depends, FastAPI, File, HTTPException, UploadFile
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from . import crud, models, schemas, spreadsheet
|
||||||
|
from .database import SessionLocal, engine
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: switch for Alembic [2]
# Creates any missing tables at import time; no migration support.
models.Base.metadata.create_all(bind=engine)

app = FastAPI()
|
||||||
|
|
||||||
|
# Dependency
|
||||||
|
def get_db():
    """FastAPI dependency: yield a DB session, always closing it afterwards."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
@app.post(
    "/items/",
    responses={
        422: {
            "description": "Загружен некорректный файл (ожидался .xlsx). "
            "Неизвестный заголовок. Или обнаружены данные в столбце без заголовка. "
            "Необходимо использовать файлы установленного образца."
        },
    },
    response_model=schemas.InsertStatus,
)
def create_items(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """импорт в базу данных Описаний наборов данных из Excel файлов"""
    # (EN) Import dataset descriptions from an uploaded .xlsx file and
    # respond with basic insert metrics (schemas.InsertStatus).
    try:
        start = datetime.now()
        # parse spreadsheet into a collection of header and data iterators
        # TODO: Multiple worksheets per Workbook or one? [3]
        # assume one for now (second one have strange formatting)
        with spreadsheet.parse(file=file) as spreadsheet_parse:
            sheet = spreadsheet_parse[0]
            # TODO use fullfledged validation framework? [6]
            # ad-hoc for now:
            # fetch known headers from the database
            # NB: we never check the actual order of the columns, so we might suffer for it

            # we have to construct a collection of known headers anyway,
            # might as well make it a set
            unknown_headers = set(sheet["header"]) - set(
                dbh.spreadsheet for dbh in crud.get_headers(db=db)
            )
            if unknown_headers:
                raise HTTPException(
                    status_code=422,
                    detail="Unknown headers in the spreadsheet: {}. Check the coordinated format".format(
                        unknown_headers
                    ),
                )

            # construct a list of schemas.Item items
            # NOTE(review): each row is mapped onto ItemCreate fields purely
            # positionally, assuming spreadsheet column order matches the
            # model's field declaration order -- confirm against the
            # coordinated spreadsheet format (see NB above).
            spreadsheet_item_list = [
                schemas.ItemCreate(
                    **{
                        key: row[i]
                        for i, key in enumerate(schemas.ItemCreate.__fields__.keys())
                    }
                )
                for row in sheet["data"]
            ]
            # dump all the data into database
            accepted, processed = crud.insert_items(db=db, items=spreadsheet_item_list)

    except spreadsheet.InvalidFileException:
        # openpyxl could not open the upload as an .xlsx workbook
        raise HTTPException(
            status_code=422, detail="Invalid file upload (expected .xlsx)"
        )
    except spreadsheet.DataInUnnamedColumnException:
        raise HTTPException(
            status_code=422, detail="Data is found in a column with empty header"
        )

    return schemas.InsertStatus(
        status="Success" if accepted == processed else "Warning",
        accepted=accepted,
        processed=processed,
        process_time=datetime.now() - start,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.get(
    "/item/{item_id}",
    response_model=schemas.Item,
    responses={404: {"description": "Такой набор данных отсутствует"}},
)
def read_item(item_id: int, db: Session = Depends(get_db)):
    """индивидуальные страницы для каждого Описания набора данных"""
    # 404 when the id does not exist; otherwise return the ORM row as-is.
    item = crud.get_item(db=db, item_id=item_id)
    if item is None:
        raise HTTPException(status_code=404, detail="Item not found")
    return item
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/items/", response_model=List[schemas.Item])
def read_items(skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """список доступных в системе Описаний наборов данных"""
    # Plain paginated listing; delegates entirely to the CRUD layer.
    return crud.get_items(db=db, limit=limit, skip=skip)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get(
    "/search/",
    response_model=List[schemas.Item],
    # FIX: typo in the user-facing description ("минимумл" -> "минимум")
    responses={400: {"description": "Запрос слишком короткий (минимум 3 символа)"}},
)
def search(q: str, skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """фильтры для поиска Описаний наборов данных по атрибутам
    дополнительный возможный синтаксис в запросах преобразуется
    в поисковый запрос содержащий:
    - простой текст: переданные слова в любом порядке
    - "текст в кавычках": переданные слова в указанном порядке
    - OR ИЛИ: какие-либо из переданных слов
    - -: не содержащий данного слова
    """
    # Decode percent-encoding and "+"-as-space before any processing.
    q = unquote_plus(q)
    # replace all full ИЛИ words with OR (websearch_to_tsquery only knows OR)
    q = substitute(r"\bИЛИ\b", "OR", q, flags=IGNORECASE)
    # NOTE: length is validated *after* the ИЛИ->OR rewrite, so the rewritten
    # (possibly shorter) query is what must reach 3 characters.
    if len(q) < 3:
        raise HTTPException(status_code=400, detail="Query too short")
    return crud.get_item_by_description(db=db, needle=q, skip=skip, limit=limit)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/headers/", response_model=List[schemas.Header])
def read_headers(db: Session = Depends(get_db)):
    """полные наименования столбцов таблиц"""
    # Straight pass-through to the CRUD layer.
    return crud.get_headers(db=db)
|
||||||
@ -0,0 +1,50 @@
|
|||||||
|
from sqlalchemy import Column, Integer, String, DateTime
|
||||||
|
|
||||||
|
# we might need vector concat later, then we'll have to bring in sqlalchemy_utils
|
||||||
|
# https://sqlalchemy-utils.readthedocs.io/en/latest/_modules/sqlalchemy_utils/types/ts_vector.html
|
||||||
|
from sqlalchemy.dialects.postgresql import TSVECTOR
|
||||||
|
|
||||||
|
from .database import Base
|
||||||
|
|
||||||
|
|
||||||
|
class Header(Base):
    """ORM mapping for the "headers" table: pairs a database-side column
    name with its spreadsheet header caption (used to validate uploads)."""

    __tablename__ = "headers"

    # database-side column name (primary key)
    database = Column(String, primary_key=True)
    # caption as it appears in the spreadsheet's header row
    spreadsheet = Column(String)
|
||||||
|
|
||||||
|
|
||||||
|
class ItemBase(Base):
    """ORM mapping for the "geodata" table: a geological dataset description.

    All payload columns are free-form strings imported from spreadsheets;
    column semantics follow the coordinated spreadsheet format (names like
    `fadr`, `oldid` are taken from it -- TODO document each, the meanings
    are not derivable from this file alone).
    """

    __tablename__ = "geodata"

    id = Column(Integer, primary_key=True, index=True)
    fadr = Column(String)
    oldid = Column(String)
    category = Column(String)
    basin = Column(String)
    deposit = Column(String)
    well = Column(String)
    depth = Column(String)
    stratum = Column(String)
    owner = Column(String)
    org = Column(String)
    ownercontacts = Column(String)
    samplelist = Column(String)
    # `description` feeds the generated tsvector used for full-text search
    description = Column(String)
    # NOTE(review): "formdimentions" is a misspelling of "formdimensions";
    # kept -- it is the actual database column name.
    formdimentions = Column(String)
    datalist = Column(String)
    resolution = Column(String)
    date = Column(String)
    additionalinfo = Column(String)
    scanner = Column(String)
    comment = Column(String)
    continuation = Column(String)
|
||||||
|
|
||||||
|
|
||||||
|
class ItemCreate(ItemBase):
    """Variant of ItemBase used for inserts: it deliberately lacks the
    geodata_search_ts column, which the database generates itself."""

    pass
|
||||||
|
|
||||||
|
|
||||||
|
class Item(ItemBase):
    """Read-side variant of ItemBase including the full-text search column."""

    # tsvector column backing full-text search; in the database it is a
    # GENERATED ALWAYS ... STORED column with a GIN index (geodata_search_idx)
    geodata_search_ts = Column(TSVECTOR)
|
||||||
@ -0,0 +1,69 @@
|
|||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class HeaderBase(BaseModel):
    """Shared fields of a header mapping: database column name and its
    spreadsheet caption."""

    # database-side column name
    database: str
    # caption as it appears in the spreadsheet header row
    spreadsheet: str
|
||||||
|
|
||||||
|
|
||||||
|
class HeaderCreate(HeaderBase):
    """Creation schema for a header mapping; no extra fields beyond the base."""

    pass
|
||||||
|
|
||||||
|
|
||||||
|
class Header(HeaderBase):
    """Заголовок описания набора геологических данных
    по результатам экспедиционных исследований"""

    # (EN) Header of a geological dataset description from field expeditions.
    # Russian docstring kept verbatim: pydantic model docstrings surface in
    # the generated OpenAPI schema.

    class Config:
        # allow constructing this schema directly from ORM rows
        orm_mode = True
|
||||||
|
|
||||||
|
|
||||||
|
class ItemBase(BaseModel):
    """Описание набора геологических данных
    по результатам экспедиционных исследований
    """

    # (EN) Geological dataset description from field expeditions.
    # All fields optional: spreadsheet cells may be empty. Field order
    # matters -- create_items maps spreadsheet columns onto __fields__
    # positionally.
    fadr: Optional[str] = None
    oldid: Optional[str] = None
    category: Optional[str] = None
    basin: Optional[str] = None
    deposit: Optional[str] = None
    well: Optional[str] = None
    depth: Optional[str] = None
    stratum: Optional[str] = None
    owner: Optional[str] = None
    org: Optional[str] = None
    ownercontacts: Optional[str] = None
    samplelist: Optional[str] = None
    description: Optional[str] = None
    # NOTE(review): misspelling of "formdimensions"; matches the DB column
    formdimentions: Optional[str] = None
    datalist: Optional[str] = None
    resolution: Optional[str] = None
    # spreadsheet date cells may arrive parsed (datetime) or as raw text
    date: Optional[Union[datetime, str]] = None
    additionalinfo: Optional[str] = None
    scanner: Optional[str] = None
    comment: Optional[str] = None
    continuation: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class ItemCreate(ItemBase):
    """Creation schema: no id (assigned by the database on insert)."""

    pass
|
||||||
|
|
||||||
|
|
||||||
|
class Item(ItemBase):
    """Read schema: an ItemBase plus its database-assigned id."""

    id: int

    class Config:
        # allow constructing this schema directly from ORM rows
        orm_mode = True
|
||||||
|
|
||||||
|
|
||||||
|
class InsertStatus(BaseModel):
    """Basic insertion status metrics returned by POST /items/."""

    # "Success" when accepted == processed, "Warning" otherwise
    status: str
    # rows received from the spreadsheet
    accepted: int
    # rows actually inserted into the database
    processed: int
    # wall-clock duration of the import
    process_time: timedelta
|
||||||
@ -0,0 +1,103 @@
|
|||||||
|
from openpyxl import Workbook, load_workbook
|
||||||
|
from openpyxl.utils.exceptions import InvalidFileException
|
||||||
|
from fastapi import File, UploadFile
|
||||||
|
|
||||||
|
from asyncio import run as asyncio_run
|
||||||
|
from contextlib import closing, contextmanager
|
||||||
|
from itertools import chain
|
||||||
|
from os import remove
|
||||||
|
from pathlib import Path
|
||||||
|
from shutil import copyfileobj
|
||||||
|
from tempfile import NamedTemporaryFile
|
||||||
|
|
||||||
|
|
||||||
|
class DataInUnnamedColumnException(Exception):
    """All the columns containing any data have to be named."""

    pass
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def parse(file: UploadFile = File(...)):
    """returns a dict with a pair of iterators
    for each sheet in the spreadsheet in a list

    [
        {"header": header_iterator,"data": data_iterator}, # sheet1
        {"header": header_iterator,"data": data_iterator}, # sheet2
        # etc
    ]

    NOTE: the iterators are lazy; consume them inside the `with` block,
    before the workbook and the temporary file are cleaned up.
    """
    # prepare return list
    result = []

    suffix = Path(file.filename).suffix
    try:
        # TODO: decide if we use subdir in /tmp here, then create it [5]
        # NOTE(review): NamedTemporaryFile raises FileNotFoundError if
        # /tmp/notgeo does not exist -- the TODO above is still open.
        with NamedTemporaryFile(delete=False, suffix=suffix, dir="/tmp/notgeo") as tmp:
            copyfileobj(file.file, tmp)
            spreadsheet_file = Path(tmp.name)

        # Unlike a normal workbook, a read-only workbook will use lazy loading.
        # The workbook must be explicitly closed with the close() method.

        # we use contextlib.closing() to do it for us
        with closing(load_workbook(spreadsheet_file, read_only=True)) as wb:
            # TODO: Multiple worksheets per Workbook or one? [3]
            # assume one for now (second one have strange formatting)
            ws = wb.active
            # assume headers are in the top row
            header = [
                cell
                for row in ws.iter_rows(max_row=1, values_only=True)
                for cell in row
            ]

            # assume data stretch is continuous
            # find first occurence of None in header and assert that
            # no unnamed column contains any data
            # .index is 0-based and min_col= is 1-based so we want to
            # go for header.index(None)+1
            # unpack row generator into separate cell iterators, chain them together,
            # and make sure none of the cells contains any data
            try:
                last_column_with_header = header.index(None)
                # NOTE(review): this scan starts at row 1, so it also sees the
                # header cells of any *named* column located after the first
                # unnamed one -- consistent with the "continuous stretch"
                # assumption above, but worth confirming.
                if any(
                    chain(
                        *(
                            ws.iter_rows(
                                min_col=last_column_with_header + 1, values_only=True
                            )
                        )
                    )
                ):
                    raise DataInUnnamedColumnException(
                        "Data is found in a column with empty header"
                    )
            except ValueError:
                # header.index(None) couldn't find anything,
                # all cell in the first row are filled,
                # so all columns have headers,
                # we can safely continue
                last_column_with_header = len(header)

            # Construct spreadsheet data iterator
            # .index is 0-based and max_col= is 1-based so we might've wanted to
            # go for header.index(None)+1, but max_col= range is including
            # and we want to only include non-empty columns, and go for
            # the previous one (-1) so in the end
            # max_col=header.index(None)+1-1
            data = ws.iter_rows(
                min_row=2, max_col=last_column_with_header, values_only=True
            )
            result.append(
                {"header": (cell for cell in header if cell is not None), "data": data}
            )
            # END [3]

            yield result
    finally:
        # clean up by explicitly closing the files and removing temporary spreadsheet
        # NOTE(review): if NamedTemporaryFile() itself failed above, `tmp` and
        # `spreadsheet_file` are unbound here and this finally-block raises
        # NameError, masking the original error -- TODO confirm/guard.
        asyncio_run(file.close())
        tmp.close()
        remove(spreadsheet_file)
|
||||||
Loading…
Reference in new issue